This was an early look at the Hard to Count scores from the census planning database, and an experiment in using tmap.
Note you will need to set the global settings to use the project directory when knitting, as documented in figure 16.1: https://bookdown.org/yihui/rmarkdown-cookbook/working-directory.html.
Note, geneorama is needed, and can be installed with devtools::install_github("geneorama/geneorama"). It’s only needed for a handful of functions, but there are also some included maps, like community areas.
##==============================================================================
## INITIALIZE
##==============================================================================
# rm(list=ls())
library("geneorama")
library("data.table")
library("rgeos")
library("leaflet")
library("colorspace")
library("sp")
library("rgdal")
library("RColorBrewer")
library("magrittr")
## Not used, this time:
# library("spdep")
# library("ggplot2")
# library("bit64")
## Load the functions in the functions directory with sourceDir
sourceDir("functions/")
The CA data is from the geneorama package. The census tracts are from another repository since they were not released by the census at the time this document was created.
##==============================================================================
## IMPORT DATA
##==============================================================================
data("chi_community_areas")
## 2020 Tract data
shp_tracts_2020 <- readRDS("data_maps_census_2020/tracts_2020_stuartlynn_Cook.Rds")
## Subset Tract data to Chicago
shp_tracts_2020 <- shp_tracts_2020[!is.na(shp_tracts_2020$community), ]
shp_tracts_2020$GEOID <- substr(shp_tracts_2020$GEO_ID, 10, 20)
Cross walks are needed to translate 2020 response rates to 2019 tracts. Only a few tracts changed, but there was no way to know how many or if the effect was concentrated ahead of time.
Also tracts need to be allocated to ward. Ward boundaries cut tracts in half often.
To do the allocation we used a private data set from Replica HQ, which has simulated population data at an individual level. The person level data is aggregated to household based on location, and those households are geocoded to geographies to come up with the translations between geographies.
##------------------------------------------------------------------------------
## Tract crosswalk based on replica data
##------------------------------------------------------------------------------
to2020 <- fread("data_census_planning/crosswalk_from_2020.csv")
to2020[ , TRACT_2020 := as.character(TRACT_2020)]
to2020[ , TRACT_prev := as.character(TRACT_prev)]
to2020[ , TRACT_2020 := substr(TRACT_2020,6,11)]
to2020[ , TRACT_prev := substr(TRACT_prev,6,11)]
to2020 <- to2020[ , list(TRACT_2020=TRACT_2020[which.max(allocation)]), TRACT_prev]
##------------------------------------------------------------------------------
## Ward crosswalk based on replica data
##------------------------------------------------------------------------------
toWard <- fread("data_census_planning/crosswalk_replica_based.csv")
toWard[ , TRACT:=substr(TRACT,6,11)]
toWard <- toWard[!is.na(TRACT)]
# toWard <- dcast(toWard, TRACT~ward, value.var = "allocation", fill = 0)
Load the locally captured response rate data. This is collected from the Census API on a daily basis using the script in this package, and a cron job.
##------------------------------------------------------------------------------
## Locally collected responses for 2020
##------------------------------------------------------------------------------
resp_filename <- max(list.files(path = "data_daily_resp_cook",
pattern = "^cook.+csv$", full.names = T))
resp_current <- fread(resp_filename)
resp_current[ , tract := NULL]
resp_current[ , GEOID := substr(GEO_ID, 10, 20)]
resp_current[ , TRACT := substr(GEO_ID, 15, 20)]
resp_current <- resp_current[match(shp_tracts_2020$TRACT, TRACT)]
## File name for current report:
resp_filename
## [1] "data_daily_resp_cook/cook 2020-09-07.csv"
## Head of current response rate data
head(resp_current)
## GEO_ID RESP_DATE DRRALL CRRALL DRRINT CRRINT state county
## 1: 1400000US17031380200 2020-09-04 0.2 65.4 0.1 26.3 17 31
## 2: 1400000US17031611500 2020-09-04 0.1 42.1 0.1 24.2 17 31
## 3: 1400000US17031711100 2020-09-04 0.0 63.6 0.0 35.0 17 31
## 4: 1400000US17031420300 2020-09-04 0.0 54.4 0.0 48.6 17 31
## 5: 1400000US17031283200 2020-09-04 0.0 68.3 0.0 62.1 17 31
## 6: 1400000US17031590600 2020-09-04 0.1 52.4 0.1 45.3 17 31
## GEOID TRACT
## 1: 17031380200 380200
## 2: 17031611500 611500
## 3: 17031711100 711100
## 4: 17031420300 420300
## 5: 17031283200 283200
## 6: 17031590600 590600
The planning database contains many tract level variables on populations across ACS and Census surveys. It was available among the census 2020 planning materials.
##------------------------------------------------------------------------------
## pdb file from CB
## Use cross walk to link to 2020 tracts
##------------------------------------------------------------------------------
# file.copy("../data-census-planning/pdb2019bgv6_us.csv",
# "data_census_planning/")
# str(pdb)
## NOTE: MANUALLY LIMITED TO COOK COUNTY FOR GITHUB SIZE LIMIT
pdb <- fread("data_census_planning/pdb2019bgv6_cook.csv", keepLeadingZeros = T)
pdb <- pdb[State=="17" & County == "031"]
# table(unique(pdb$Tract) %in% to2020$TRACT_prev)
# table(unique(pdb$Tract) %in% to2020$TRACT_2020)
pdb[ , TRACT_2020 := to2020[match(pdb$Tract, TRACT_prev), TRACT_2020]]
Test overlap and coverage between shapefile and planning database:
table(shp_tracts_2020$TRACT %in% pdb$TRACT_2020)
##
## FALSE TRUE
## 12 779
table(pdb$TRACT_2020 %in% shp_tracts_2020$TRACT)
##
## FALSE TRUE
## 1847 2146
table(unique(pdb$TRACT_2020) %in% shp_tracts_2020$TRACT)
##
## FALSE TRUE
## 1 779
Although the overlap isn’t perfect, other analysis indicates that the errors are not significant. Most of tracts with error are unpopulated, or very sparsely populated; for example airports.
pdb <- pdb[pdb$TRACT_2020 %in% shp_tracts_2020$TRACT]
dim(pdb)
## [1] 2146 347
## Estimates of total population
sum(pdb$Tot_Population_CEN_2010) ## 2,695,249
## [1] 2644979
sum(pdb$Tot_Population_ACS_13_17) ## 2,722,098
## [1] 2669418
sum(pdb$Tot_Population_ACSMOE_13_17) ## 705,132 (?)
## [1] 693327
## Estimates for total households
sum(pdb$Tot_Housing_Units_CEN_2010) ## 1,194,116
## [1] 1164503
sum(pdb$Tot_Housing_Units_ACS_13_17) ## 1,200,059
## [1] 1168850
sum(pdb$Tot_Housing_Units_ACSMOE_13_17) ## 203,658 (?)
## [1] 198935
Geocode tracts to Community Area for the shapefile, then match the results to the other planning and response rate files.
##------------------------------------------------------------------------------
## Get community area
##------------------------------------------------------------------------------
## Geocode tracts to Community Area
shp_tracts_2020$community <- geocode_to_map(shp_tracts_2020$lat_centroid,
shp_tracts_2020$lon_centroid,
map = chi_community_areas,
map_field_name = "community")
## Match to planning database
pdb$community <- shp_tracts_2020$community[match(pdb$TRACT_2020, shp_tracts_2020$TRACT)]
## Match to current response
resp_current$community <- shp_tracts_2020$community[match(resp_current$TRACT,
shp_tracts_2020$TRACT)]
The census is conducted at a household level, so aggregage key statistics of the planning db to a household level.
Once this is done, join in the response rate data so that everything is in one place.
##------------------------------------------------------------------------------
## Aggregate planning database statistics to 2020 tract
##------------------------------------------------------------------------------
pdb_household <- pdb[i = TRUE,
j = list(households = sum(Tot_Housing_Units_ACS_13_17),
Tot_Population_ACS_13_17 = sum(Tot_Population_ACS_13_17),
Tot_Occp_Units_ACS_13_17 = sum(Tot_Occp_Units_ACS_13_17),
Hispanic_ACS_13_17 = sum(Hispanic_ACS_13_17),
pct_hisp = sum(Hispanic_ACS_13_17) / sum(Tot_Population_ACS_13_17)),
keyby = list(TRACT = TRACT_2020)]
pdb_household
## TRACT households Tot_Population_ACS_13_17 Tot_Occp_Units_ACS_13_17
## 1: 010100 2626 4444 2248
## 2: 010201 2994 7197 2670
## 3: 010202 1216 2487 976
## 4: 010300 3395 6413 2982
## 5: 010400 2218 5411 1870
## ---
## 775: 843700 1004 2549 963
## 776: 843800 847 1699 630
## 777: 843900 2267 3332 1842
## 778: 980000 0 0 0
## 779: 980100 0 0 0
## Hispanic_ACS_13_17 pct_hisp
## 1: 410 0.09225923
## 2: 1627 0.22606642
## 3: 717 0.28829916
## 4: 1204 0.18774365
## 5: 390 0.07207540
## ---
## 775: 680 0.26677128
## 776: 409 0.24072984
## 777: 108 0.03241297
## 778: 0 NaN
## 779: 0 NaN
## Join in response rates and community area names
ii <- match(pdb_household$TRACT, resp_current$TRACT)
pdb_household$response <- resp_current[ii, CRRALL]
pdb_household$community <- resp_current[ii, community]
One of the big questions was how language barriers would affect outreach, so we looked at how many households are Hispanic. Other languages were also examined, but this was a good starting point.
hist(pdb_household$pct_hisp)
What is the distribution of the current response rate?
hist(pdb_household$response)
##------------------------------------------------------------------------------
## Aggregate response data to community area
##------------------------------------------------------------------------------
summary_community <- pdb_household[
i = !is.na(response),
j = list(resp = round(sum(response * households)/sum(households)/100, 2),
pop = sum(Tot_Population_ACS_13_17),
occ_households = sum(Tot_Occp_Units_ACS_13_17),
tot_households = sum(households),
hisp_pct = round(sum(pct_hisp * households) / sum(households), 2)),
keyby = community]
setnames(summary_community, gsub("_"," ",colnames(summary_community)))
setnames(summary_community, capwords(colnames(summary_community)))
# clipper(summary_community) ## COPY TO CLIPBOARD FOR EXCEL
summary_community
## Community Resp Pop Occ Households Tot Households Hisp Pct
## 1: <NA> 0.63 69100 40300 45426 0.07
## 2: ALBANY PARK 0.61 51992 16563 18240 0.48
## 3: ARCHER HEIGHTS 0.57 13169 3900 4291 0.78
## 4: ARMOUR SQUARE 0.59 13396 5192 5708 0.04
## 5: ASHBURN 0.75 43792 12982 13612 0.37
## 6: AUBURN GRESHAM 0.57 46278 17111 20614 0.02
## 7: AUSTIN 0.53 95260 31766 37248 0.13
## 8: AVALON PARK 0.66 10034 3898 4574 0.01
## 9: AVONDALE 0.55 37368 13335 15109 0.57
## 10: BELMONT CRAGIN 0.53 79956 22131 24407 0.80
## 11: BEVERLY 0.77 20822 7673 8048 0.05
## 12: BRIDGEPORT 0.57 33696 12751 13845 0.24
## 13: BRIGHTON PARK 0.47 44813 12314 13941 0.83
## 14: BURNSIDE 0.56 2254 965 1315 0.01
## 15: CALUMET HEIGHTS 0.69 13188 5271 6044 0.03
## 16: CHATHAM 0.54 31071 13650 16725 0.01
## 17: CHICAGO LAWN 0.50 53098 16001 18997 0.46
## 18: CLEARING 0.68 25891 8760 9242 0.52
## 19: DOUGLAS 0.57 20781 9590 10786 0.03
## 20: DUNNING 0.72 43689 15616 16624 0.27
## 21: EAST GARFIELD PARK 0.46 20004 6819 8336 0.04
## 22: EAST SIDE 0.61 23771 6843 7817 0.81
## 23: EDGEWATER 0.68 49478 25088 28241 0.17
## 24: EDISON PARK 0.76 11753 4638 4968 0.10
## 25: ENGLEWOOD 0.37 25075 9164 14227 0.03
## 26: FOREST GLEN 0.81 18997 6788 7223 0.14
## 27: FULLER PARK 0.43 2354 1045 1484 0.06
## 28: GAGE PARK 0.51 37943 8819 9762 0.93
## 29: GARFIELD RIDGE 0.74 36452 12179 12847 0.48
## 30: GRAND BOULEVARD 0.52 17100 7911 9546 0.02
## 31: GREATER GRAND CROSSING 0.46 31766 12129 15638 0.01
## 32: HEGEWISCH 0.62 9384 3499 3943 0.57
## 33: HERMOSA 0.51 24144 7022 7792 0.85
## 34: HUMBOLDT PARK 0.45 56427 16813 19627 0.55
## 35: HYDE PARK 0.59 23749 10909 12448 0.09
## 36: IRVING PARK 0.63 54606 20236 22526 0.42
## 37: JEFFERSON PARK 0.72 26808 10359 11174 0.22
## 38: KENWOOD 0.55 11812 5894 6691 0.02
## 39: LAKE VIEW 0.68 81384 40927 44727 0.08
## 40: LINCOLN PARK 0.65 62961 29483 32357 0.06
## 41: LINCOLN SQUARE 0.70 41715 18296 19703 0.16
## 42: LOGAN SQUARE 0.61 73927 29573 32815 0.41
## 43: LOOP 0.56 18469 8553 10218 0.08
## 44: LOWER WEST SIDE 0.44 32888 11947 13716 0.75
## 45: MCKINLEY PARK 0.55 15767 5140 5554 0.60
## 46: MONTCLARE 0.62 13784 4512 5040 0.58
## 47: MORGAN PARK 0.70 22375 8070 8980 0.04
## 48: MOUNT GREENWOOD 0.75 19277 6595 7032 0.09
## 49: NEAR NORTH SIDE 0.57 78237 48148 56904 0.06
## 50: NEAR SOUTH SIDE 0.74 4605 2424 2535 0.02
## 51: NEAR WEST SIDE 0.60 62872 28162 31242 0.09
## 52: NEW CITY 0.39 36421 11098 14165 0.58
## 53: NORTH CENTER 0.69 35789 14488 16227 0.11
## 54: NORTH LAWNDALE 0.42 34069 10653 13795 0.08
## 55: NORTH PARK 0.68 18842 6514 7035 0.17
## 56: NORWOOD PARK 0.78 36936 14750 15757 0.12
## 57: OAKLAND 0.62 3337 1322 1462 0.03
## 58: OHARE 0.63 12139 6015 6536 0.07
## 59: PORTAGE PARK 0.64 64313 22693 25024 0.40
## 60: PULLMAN 0.59 6613 3083 3480 0.06
## 61: RIVERDALE 0.53 7394 2428 3304 0.04
## 62: ROGERS PARK 0.63 49651 22218 25355 0.22
## 63: ROSELAND 0.56 39164 13102 16412 0.01
## 64: SOUTH CHICAGO 0.45 24444 9168 12332 0.18
## 65: SOUTH DEERING 0.61 14614 5009 5841 0.31
## 66: SOUTH LAWNDALE 0.40 74851 17824 20965 0.91
## 67: SOUTH SHORE 0.47 45535 19599 25798 0.01
## 68: UPTOWN 0.67 57973 29341 32599 0.14
## 69: WASHINGTON HEIGHTS 0.66 27453 9570 10470 0.01
## 70: WASHINGTON PARK 0.43 11502 4334 5676 0.02
## 71: WEST ELSDON 0.66 19210 5137 5463 0.82
## 72: WEST ENGLEWOOD 0.42 29929 9642 12889 0.06
## 73: WEST GARFIELD PARK 0.41 17155 5335 7472 0.02
## 74: WEST LAWN 0.64 33108 8988 9807 0.80
## 75: WEST PULLMAN 0.57 27733 9133 11216 0.06
## 76: WEST RIDGE 0.64 76215 25194 28185 0.19
## 77: WEST TOWN 0.59 83621 37127 40475 0.24
## 78: WOODLAWN 0.46 21875 8639 11201 0.03
## Community Resp Pop Occ Households Tot Households Hisp Pct
The final output this time was a map of the Low Response Score, which is the estimated non-response.
##------------------------------------------------------------------------------
## TMAP tract map
##------------------------------------------------------------------------------
library(sf)
## Linking to GEOS 3.7.1, GDAL 2.2.3, PROJ 4.9.3
library(tmap)
shp <- shp_tracts_2020
shp@data <- cbind(shp@data,
pdb[i = match(shp@data$TRACT,
TRACT_2020),
j = list(Low_Response_Score,
Tot_Population_CEN_2010,
Tot_Housing_Units_CEN_2010,
NH_Blk_alone_ACS_13_17,
Hispanic_ACS_13_17,
ENG_VW_ACS_13_17)],
resp_current[i = match(shp@data$TRACT,
TRACT),
j = list(resp = CRRALL/100)])
## convert shape file to tmap format
shp_sf <- st_as_sf(shp, crs = 4326)
popups <- c("NAME", "Low_Response_Score", "Tot_Population_CEN_2010",
"Tot_Housing_Units_CEN_2010", "NH_Blk_alone_ACS_13_17",
"Hispanic_ACS_13_17", "ENG_VW_ACS_13_17")
breaks <- seq(0, 45, by = 5)
map1 <- tm_shape(shp_sf[sf::st_is_valid(shp_sf), ],
is.master = TRUE) +
tm_polygons(col = "Low_Response_Score",
palette = "YlOrRd",
style="cont",
alpha = .7,
title = "Low Response Score",
breaks = breaks,
popup.var = popups,
group = "Relevant Community Area Census Tracts") +
tm_layout(title = "Heatmap of Low Response Scores") +
tm_view(view.legend.position = c("left", "bottom")) +
tm_basemap(server = "OpenMapSurfer.Roads") +
tm_shape(chi_community_areas) +
tm_borders(col = "blue", lwd=3, group = paste0("Community Areas", " Outline"))
tmap_mode(mode = c("plot", "view")[2])
## tmap mode set to interactive viewing
map1
Overall shape of low response score.
hist(shp@data$Low_Response_Score)